##### Descriptive Text Data #####
#Remove objects
rm(list = ls())

#Load pkgs#
library(quanteda)
library(tidyverse)
library(xtable)
library(corrplot)
library(fastDummies)
library(ggcorrplot)
library(readxl)

#Load Document-Frequency-Matrix of news outlets, including topic distributions on headline level
dfm <- readRDS("input/text_corpus.rds")

#Load Labels
labels <- read_delim("input/labels.csv",";", escape_double = FALSE, trim_ws = TRUE)

#Read main dataset
#Load main dataset
df_merged_mean <- read_delim("input/df_merged_mean.csv", 
                             ";", escape_double = FALSE, col_types = cols(date_text = col_date(format = "%d/%m/%Y")), 
                             trim_ws = TRUE)

#Table 2: Top 10 identified topics
#Load labeled topics and translated topics file from Excel file
topics_50_labeled <- read_excel("input/topics50_labeled.xlsx")
topics_50_labeled$`p(z)`<-as.numeric(topics_50_labeled$`p(z)`)

print(xtable(topics_50_labeled[1:10,c(1,5,3)],digits=3),
      type="html",
      file="output/tables/Table_2_Top10.html")


#Table 3: Categorization of topics and presentation of results 
#Assign full names
topics_names <- c(
  "General opinion","Sanctions","National assembly","Maduro","Opposition candidate",
  "Government-opposition dialog","Election","Migration crisis","Protest/shortages",
  "Oscar Perez/repression","Earthquake/accidents","International Organizations","Government ministers",
  "Political Prisoners","USA/Korea","Petroleum","Resignations","Corruption","Salary/prices",
  "Colombia border","Shortages healthcare","Exchange rate","Shortages","Russia","Outages",
  "Exiled opposition","Music/entertainment","Child mortality","Court sentences/Lula da Silva","PDVSA",
  "Church/job offer (mixed)","Food policy","Airline (opening/closing)","Indigenous groups/diseases",
  "Education Studies","Mining/sport (mixed)","Economy policy","Recession","Cryptomoney","Economy opinion",
  "Military","Work opinion","Cuba","Weather","Leftist opposition","Regulations","Investment/infrastructure",
  "Investment","Spanish development aid","Financial market")

category_table <- cbind(labels,topics_names)

category_table$category <- ifelse(topics_names %in% c("Protest/shortages","Oscar Perez/repression",
                                                      "Political Prisoners","Military"),
                                  "Protest/repression","Social/economic crisis")
category_table$category <- ifelse(topics_names %in% c("National assembly","International Organizations",
                                                      "Resignations","Corruption","Exiled opposition","Leftist opposition",
                                                      "Election","Opposition candidate"),
                                  "Political legitimacy",category_table$category )
category_table$category <- ifelse(topics_names %in% c("Maduro","Government-opposition dialog",
                                                      "Government ministers",
                                                      "Food policy","Economy policy","Cryptomoney",
                                                      "Regulations","General opinion","Spanish development aid"),
                                  "Domestic politics",category_table$category )
category_table$category <- ifelse(topics_names %in% c("Education Studies","Music/entertainment",
                                                      "Weather","Earthquake/accidents","Church/job offer (mixed)"
),
"Nonpolitical",category_table$category)

category_table$category <- ifelse(topics_names %in% c("Russia","Colombia border","USA/Korea","Cuba","Court sentences/Lula da Silva"
),"International politics",category_table$category)

topics <- rbind(paste(as.vector(dplyr::filter(category_table,category=="Domestic politics")$topics_names),collapse = ", "),
                 paste(as.vector(dplyr::filter(category_table,category=="International politics")$topics_names),collapse = ", "),
                 paste(as.vector(dplyr::filter(category_table,category=="Nonpolitical")$topics_names),collapse = ", "),
                paste(as.vector(dplyr::filter(category_table,category=="Political legitimacy")$topics_names),collapse = ", "),
                 paste(as.vector(dplyr::filter(category_table,category=="Protest/repression")$topics_names),collapse = ", "),
                 paste(as.vector(dplyr::filter(category_table,category=="Social/economic crisis")$topics_names),collapse = ", "))
                 
category_table <- category_table %>%
  group_by(category)%>%
  summarise_if(is.numeric,funs(sum),na.rm = TRUE)

category_table$topics <- topics

#Order along share
category_table <- category_table %>% arrange(desc(`p(z)`))


#Share of pos. and sign. related (alpha=0.05/281) topics as determined in the "analysis.R" file
#not considering left out topics
category_table$share <- c("16.7%","20%","57.1%","75%","20%","0%")

print(xtable(category_table,digits = 3),include.rownames=F,
      type="html", file="output/tables/Table_3_categorization.html"
      )

print(xtable(category_table,digits = 3),include.rownames=F,
      type="html", file="output/tables/Table_C1_categorization.html"
)


##### For Appendix E: Text Analysis and Topic Modeling#####

#Headlines per day/website

headlines_day_website <- dfm@docvars %>% 
  dplyr::count(website_id,date_text) %>% 
  group_by(website_id)

summary(headlines_day_website$n)
#35 as mean


summary_statistic <- as.data.frame(rbind(c(nrow(headlines_day_website),dfm@Dim[1],mean(headlines_day_website$n))))
summary_statistic <- rbind(summary_statistic,c("Avg. term lenght (headline)","Std. dev. term lenght (headline)","Total no. of stemmed terms"))
summary_statistic <- rbind(summary_statistic,c(round(mean(ntoken(dfm)),2),round(sd(ntoken(dfm)),2),dfm@Dim[2]))

names(summary_statistic) <- c("No. of website/days","No. of headlines","Avg. no. of headlines (website/day)")


#Table E.1: Summary statistics of text corpus.
print(xtable(summary_statistic,digits = 2),include.rownames=FALSE,
      type="html", file="output/tables/Table_E1_summary_text.html"
)

#Explore semantic validity as judged in Table E.2: Generated Topics (K=50)
#Semantic validity#
get_top_docs <- function(i){
  final_data_order <- dfm@docvars %>% arrange_at(i,desc)
  toptext <- final_data_order$text[1:25]
  return(toptext)
}

numbers <- seq(9:58)+9
top_docs <- sapply(numbers, get_top_docs)
top_docs <- as.data.frame(top_docs)
colnames(top_docs) <- names(dfm@docvars)[10:59] #Link headlines to respective topics
#Information used to judge semantic validity of the created topics in Model E.2
write.csv(top_docs,"output/top_docs.csv")

#Example temporal patterns across websites for the topic recession
#Figure E.2:  Development of the topic recession in Venezuela November 2017 - June 2018
#Aggregate on the date and website_id level
final_data_date_id <- df_merged_mean %>%
  group_by(date_text,name)%>%
  summarise_all(funs(mean),na.rm = TRUE)

ggplot(final_data_date_id) +
  geom_col(aes(date_text,recession),col="grey40")+
  geom_vline(aes(xintercept=as.numeric(as.Date("2018-05-20"))),color="black",lwd=1,lty="dashed")+
  geom_vline(aes(xintercept=as.numeric(as.Date("2017-12-10"))),color="black",lwd=1,lty="dashed")+
  scale_x_date(date_breaks = "1 month", date_labels ="%b") + facet_wrap(~name,scales="fixed",ncol=4,nrow=5)+
  xlab("Date") + ylab("Overall proportion of topic") + theme_bw()

ggsave("output/figures/Figure_E2_financial.pdf",width=9, height=5)

#Predictive validity of some topics with external events
## Aggregate to date only 
final_data_date <- dfm@docvars %>%
  group_by(date_text)%>%
  summarise_all(funs(mean),na.rm = TRUE)


#Plot examples:
#Electoral procedures (Figure  E.3:   Temporal  development  of  the  aggregated  topic election)
ggplot(final_data_date) +
  geom_col(aes(date_text,electoral_procedures),col="grey40")+
  geom_vline(aes(xintercept=as.numeric(as.Date("2018-05-20"))),color="black",lwd=1,lty="dashed")+
  geom_vline(aes(xintercept=as.numeric(as.Date("2017-12-10"))),color="black",lwd=1,lty="dashed")+
  scale_x_date(date_breaks = "1 month", date_labels ="%b") +
  xlab("Date") + ylab("Overall proportion of topic") + theme_bw()

ggsave("output/figures/Figure_E3_election.pdf",width=9, height=5)


#Oscar Perez (Figure E.4: Temporal development of the aggregated topic ́Oscar P ́erez/repression.)
ggplot(final_data_date) +
  #  geom_point(aes(date_text,factor(off,labels = c("No","Yes")),color = factor(off)))+
  geom_col(aes(date_text,oscar_perez),col="grey40")+
  geom_vline(aes(xintercept=as.numeric(as.Date("2018-01-15"))),color="black",lwd=1,lty="dashed")+
  scale_x_date(date_breaks = "1 month", date_labels ="%b") +
  xlab("Date") + ylab("Overall proportion of topic") + theme_bw()

ggsave("output/figures/Figure_E4_oscar.pdf",width=9, height=5)


#Cross correlations figures#

#Create Dummies for each website and add to main dataset
df_merged_mean <- fastDummies::dummy_cols(df_merged_mean, select_columns =
                                            c("website_id"), remove_first_dummy=F)


#First correlation matrix
corr <- round(cor(df_merged_mean[,c(6,19:68,129:147)],use="pairwise"), 1)
head(corr[, 1:4])

#Exclude topics that correlate strongly (>=0.6) to specific websites
index <- which(abs(corr) > .5 & abs(corr) < 1, arr.ind = T)

#Create p-values for correlation matrix
p.mat <- cor_pmat(df_merged_mean[,c(6,19:68,129:147)],use="pairwise")
head(p.mat[, 1:4])

#Names to plot
topic_website_names_output <- c("DoS attack","General opinion","Salary/prices","Economy opinion","Food policy","Investment","Exiled opposition","Court sentences",
                                "Economy policy","Child mortality","Spanish development aid","Outages","Weather","Election","Government-Opposition dialog",
                                "Music/entertainment","Government ministers","Investment/infrastructure","Protest/shortages",
                                "Leftist opposition",
                                "Colombia border","Oscar Perez", "Indigenous groups/diseases","Church/Job offer (mixed)","National assembly",
                                "Airline (opening/closing)",
                                "Political Prisoners","Russia","Sanctions","Military","Cryptomoney","Shortages healthcare","Regulations",
                                "USA/Korea","Cuba","International Organizations","Exchange Rate","Maduro","Work opinion","Migration crisis",
                                "Petroleum","PDVSA","Resignations","Mining/Sport (mixed)","Opposition candidate","Earthquake/accidents",
                                "Shortages","Recession",'Financial market',"Corruption","Education studies",
                                "www.globovision.com","www.radiofeyalegrianoticias.net","www.televen.com","unionradio.net",
                                "www.noticierovenevision.net","www.analitica.com","www.aporrea.org","canaldenoticia.com","confirmado.com.ve",
                                "dolartoday.com","elperiodicovenezolano.com","informe21.com","www.lapatilla.com","www.notiactual.com",
                                "www.noticierodigital.com","www.dinero.com.ve",
                                "www.2001.com.ve","www.el-nacional.com","www.eluniversal.com")


#Show correlation between topics
colnames(corr) <- topic_website_names_output
rownames(corr) <- topic_website_names_output
colnames(p.mat) <- topic_website_names_output
rownames(p.mat) <- topic_website_names_output

pdf(file="output/figures/Figure_E1_correlation.pdf",width=8, height=8)
corrplot::corrplot(corr, p.mat = p.mat, insig = "blank", type = "upper", tl.pos = "td",
                   method = "circle", tl.col = 'black',
                   addCoef.col="black", 
                   order = "FPC", tl.cex = 0.5,number.cex = .25
                   ,cl.pos = "n")
dev.off()
#when insig = "n"; pNew error due to insign. displayed value, plot still showing the correct results
# changed to blank in order to avoid error and not showing insign. values

#Table E.2: Labeled Topics (K=50))
print(xtable(topics_50_labeled),include.rownames=FALSE,
      type="html", file="output/tables/Table_E1_summary_text50.html"
)

#Table E.3 Labeled Topics (K=25) and Table E.4 Generated Topics (K=100) 
topics25_labeled <- read_excel("input/topics25_labeled.xlsx", 
                                 col_types = c("text", "skip", "text"))

print(xtable(topics25_labeled),include.rownames=FALSE,
      type="html", file="output/tables/Table_E3_summary_text25.html"
)

topics100_labeled <- read_excel("input/topics100_labeled.xlsx", 
                               col_types = c("text", "skip", "text"))

print(xtable(topics100_labeled),include.rownames=FALSE,
      type="html", file="output/tables/Table_E4_summary_text100.html"
)